Loading libraries
library(dplyr)
library(ggplot2)
library(plotly)
library(tidyr)
library(knitr)
library(corrplot)
library(caret)
library(data.table)
Reading data
removable_columns <- c("title", "pdb_code", "res_id", "chain_id", "local_res_atom_count", "local_res_atom_non_h_occupancy_sum", "local_res_atom_non_h_electron_occupancy_sum", "local_res_atom_C_count", "local_res_atom_N_count", "local_res_atom_O_count", "local_res_atom_S_count", "dict_atom_C_count", "dict_atom_N_count", "dict_atom_O_count", "dict_atom_S_count", "skeleton_data", "skeleton_cycle_4", "skeleton_diameter", "skeleton_cycle_6", "skeleton_cycle_7", "skeleton_closeness_006_008", "skeleton_closeness_002_004", "skeleton_cycle_3", "skeleton_avg_degree", "skeleton_closeness_004_006", "skeleton_closeness_010_012", "skeleton_closeness_012_014", "skeleton_edges", "skeleton_radius", "skeleton_cycle_8_plus", "skeleton_closeness_020_030", "skeleton_deg_5_plus", "skeleton_closeness_016_018", "skeleton_closeness_008_010", "skeleton_closeness_018_020", "skeleton_average_clustering", "skeleton_closeness_040_050", "skeleton_closeness_014_016", "skeleton_center", "skeleton_closeness_000_002", "skeleton_density", "skeleton_closeness_030_040", "skeleton_deg_4", "skeleton_deg_0", "skeleton_deg_1", "skeleton_deg_2", "skeleton_deg_3", "skeleton_graph_clique_number", "skeleton_nodes", "skeleton_cycles", "skeleton_cycle_5", "skeleton_closeness_050_plus", "skeleton_periphery", "fo_col", "fc_col", "weight_col", "grid_space", "solvent_radius", "solvent_opening_radius", "part_step_FoFc_std_min", "part_step_FoFc_std_max", "part_step_FoFc_std_step")
data <- fread("./all_summary.csv", nrows = 10000, header = TRUE, drop = removable_columns)
dim(data)
## [1] 10000 350
Processing missing data
dim(data)
## [1] 10000 350
data <- data %>%
drop_na()
dim(data)
## [1] 8958 350
Deleting chosen ligands
deletable_res_name <- c("UNK", "UNX", "UNL", "DUM", "N", "BLOB", "ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY", "HIS", "ILE", "LEU", "LYS", "MET", "MSE", "PHE", "PRO", "SEC", "SER", "THR", "TRP", "TYR", "VAL", "DA", "DG", "DT", "DC", "DU", "A", "G", "T", "C", "U", "HOH", "H20", "WAT")
data <- data %>% filter(!res_name %in% deletable_res_name)
dim(data)
## [1] 8910 350
Data summary
statistics <- data %>%
select(res_name, blob_volume_coverage, blob_volume_coverage_second)
kable(summary(statistics))
|
Length:8910 |
Min. :0.02305 |
Min. :0.00000 |
|
Class :character |
1st Qu.:0.50648 |
1st Qu.:0.00000 |
|
Mode :character |
Median :0.72244 |
Median :0.00000 |
|
NA |
Mean :0.66784 |
Mean :0.02067 |
|
NA |
3rd Qu.:0.86480 |
3rd Qu.:0.00000 |
|
NA |
Max. :1.00000 |
Max. :0.95385 |
dim(data)
## [1] 8910 350
50 most popular ligands
popular_ligands <- data %>%
select(res_name) %>%
count(res_name, sort = TRUE) %>%
slice(1:50)
popular_names_vector <- popular_ligands %>%
pull(res_name)
data <- data %>% filter(res_name %in% popular_names_vector)
dim(data)
## [1] 6239 350
Cardinality of ligands by name
plot_ligands <- ggplot(popular_ligands, aes(x = reorder(res_name, -n), y = n, fill = n)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90)) +
xlab("ligand")+
labs(title = "Cardinality of ligands by name")
ggplotly(plot_ligands)
Correlation between variables
data %>%
select_if(is.numeric) %>%
cor %>%
corrplot(method = "circle", tl.col = "black", tl.srt = 45)
## Warning in cor(.): odchylenie standardowe wynosi zero

Distribution of atom and electron count
plot_atom <- ggplot(data, aes(x = local_res_atom_non_h_count)) +
geom_density(alpha = .3, fill = "#00CECB", color = NA) +
xlab("atom count") +
labs(title = "Atom count distribution")
ggplotly(plot_atom)
plot_electron <- ggplot(data, aes(x = local_res_atom_non_h_electron_sum)) +
geom_density(alpha = .3, fill = "#FF5E5B", color = NA) +
xlab("electron count") +
labs(title = "Electron count distribution")
ggplotly(plot_electron)
Distribution of part_01 columns
plot_part_data <- data %>%
select(contains("part_01")) %>%
gather(part, value, 1:106)
dim(plot_part_data)
## [1] 661334 2
# plot_part_data_1 <- plot_part_data[1:118926,]
# plot_part_data_2 <- plot_part_data[118927:237852,]
# plot_part_data_3 <- plot_part_data[237853:356778,]
# plot_part_data_4 <- plot_part_data[356779:475704,]
# plot_part_data_5 <- plot_part_data[475705:594630,]
# plot_part_data_6 <- plot_part_data[594631:700342,]
#
# plot_ly(plot_part_data_1, x = plot_part_data_1$part, y = plot_part_data_1$value, type = 'box')
# plot_ly(plot_part_data_2, x = plot_part_data_2$part, y = plot_part_data_2$value, type = 'box')
# plot_ly(plot_part_data_3, x = plot_part_data_3$part, y = plot_part_data_3$value, type = 'box')
# plot_ly(plot_part_data_4, x = plot_part_data_4$part, y = plot_part_data_4$value, type = 'box')
# plot_ly(plot_part_data_5, x = plot_part_data_5$part, y = plot_part_data_5$value, type = 'box')
# plot_ly(plot_part_data_6, x = plot_part_data_6$part, y = plot_part_data_6$value, type = 'box')
Greatest inconsistency in classes
Atom
data %>%
select(res_name, local_res_atom_non_h_count, dict_atom_non_h_count) %>%
group_by(res_name) %>%
summarise(atom_inconsistency = mean(abs(local_res_atom_non_h_count - dict_atom_non_h_count))) %>%
arrange(-atom_inconsistency) %>%
slice(1:10) %>%
kable()
| PLC |
17.1481481 |
| LHG |
4.4615385 |
| C8E |
2.6428571 |
| NDP |
1.7333333 |
| NAP |
1.5090909 |
| PG4 |
1.4225352 |
| MLY |
1.2222222 |
| CME |
1.0000000 |
| MAN |
1.0000000 |
| NAG |
0.9949495 |
Electron
data %>%
select(res_name, local_res_atom_non_h_electron_sum, dict_atom_non_h_electron_sum) %>%
group_by(res_name) %>%
summarise(electron_inconsistency = mean(abs(local_res_atom_non_h_electron_sum - dict_atom_non_h_electron_sum))) %>%
arrange(-electron_inconsistency) %>%
slice(1:10) %>%
kable()
| PLC |
114.444444 |
| LHG |
34.096154 |
| C8E |
16.714286 |
| NDP |
11.333333 |
| NAP |
10.654545 |
| PG4 |
9.633803 |
| MLY |
9.370370 |
| CME |
8.000000 |
| MAN |
8.000000 |
| NAG |
7.959596 |
Sekcję sprawdzającą czy na podstawie wartości innych kolumn można przewidzieć liczbę elektronów i atomów oraz z jaką dokładnością można dokonać takiej predykcji; trafność regresji powinna zostać oszacowana na podstawie miar R^2 i RMSE;
Linear regression
Electron count
data_partition <- data %>%
select_if(is.numeric)
set.seed(111)
partition <- createDataPartition(y = data_partition$local_res_atom_non_h_electron_sum, p = 0.7, list = FALSE)
data_train <- data %>%
slice(partition)
data_test <- data %>%
slice(-partition)
dim(data_train)
## [1] 4369 350
dim(data_test)
## [1] 1870 350
# set.seed(111)
# fit <- train(local_res_atom_non_h_electron_sum ~ ., data = data_train, method = "lm")
# set.seed(111)
# prediction <- predict(fit, newdata = data_test)
# fit
Atom count